import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.ticker as mtick
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import pandas_profiling
from pandas_profiling import ProfileReport
# Load the workforce sheet and discard the 'Unnamed: ...' columns
# (presumably index columns written out by an earlier export -- confirm).
data = pd.read_excel('data_final_2.xlsx', header=0)
list_drop_data = ['Unnamed: 0', 'Unnamed: 0.1']
data2 = data.drop(columns=list_drop_data)

# Load the disability records the same way and tag every record with
# INC_TRUE = 1 so that matched rows can be recognised after the join.
incapacidad = pd.read_excel('incapacidades_final.xlsx', header=0)
list_drop_inc = ['Unnamed: 0']
incapacidad2 = incapacidad.drop(columns=list_drop_inc)
incapacidad2['INC_TRUE'] = 1

# Left join on IDENTIFICADOR: rows of data2 without a matching disability
# record are kept and get NaN in INC_TRUE.
disability_by_worker = incapacidad2.set_index('IDENTIFICADOR')
result = data2.join(disability_by_worker, on='IDENTIFICADOR')
This chart shows all active workers who have been disabled at least once, broken down by gender and by cause of disability.
# Plot 1: percentage of disability records per cause (CONTINGENCIA) and
# gender (GENERO), restricted to rows with ACTIVO == 1 and INC_TRUE == 1.
active_with_disability = (result["ACTIVO"] == 1) & (result["INC_TRUE"] == 1)
resultplot1 = (
    result[active_with_disability]
    .groupby(['GENERO', 'CONTINGENCIA'])['CONTINGENCIA']
    .count()
    .to_frame(name='TOTAL_INCAPACIDADES')
)

# Share of each (gender, cause) group over all counted records, in percent.
group_totals = resultplot1["TOTAL_INCAPACIDADES"].values
x_p = list(np.round(group_totals / sum(group_totals), 4) * 100)
resultplot1["%"] = x_p

# Turn the (GENERO, CONTINGENCIA) index back into columns for seaborn.
resultplot1 = resultplot1.reset_index()

plt.figure(figsize=(20, 8))
plt.title("Active Workers vs % Type of Disability")
sns.barplot(x='CONTINGENCIA', y='%', hue='GENERO', data=resultplot1)
plt.grid(linestyle='--', linewidth=0.5)
plt.show()
Among the active workers of EFICACIA SA, female staff report more disabilities than male staff. In addition, the main cause of disability is General Illness. Work accidents account for 7.42% of the disabilities reported by active workers, and women report more work accidents than men during the observed period.
First, we filter the disabilities caused by work accidents. Then, we identify the codes of the clients that reported more than 100 work accidents.
# Plot 2: clients (COD_CLIENTE) with more than 100 reported work accidents.
work_accidents = result[result['CONTINGENCIA'] == "Accidente de Trabajo"]
resultplot2 = (
    work_accidents
    .groupby(['COD_CLIENTE', 'CONTINGENCIA'])['CONTINGENCIA']
    .count()
    .to_frame(name='TOTAL_ACCIDENTES')
    .reset_index()
)

# Each client's share of all work accidents, in percent.
accident_counts = resultplot2["TOTAL_ACCIDENTES"].values
x_p = list(np.round(accident_counts / accident_counts.sum(), 4) * 100)
resultplot2["%"] = x_p

# The index is reset exactly once (above); resetting it again would only add
# a meaningless 'index' column to the frame.
resultplot2['TOTAL_ACCIDENTES_100'] = resultplot2['TOTAL_ACCIDENTES'] > 100
# The flag column is already boolean, so it can be used directly as a mask.
resultplot21 = resultplot2[resultplot2['TOTAL_ACCIDENTES_100']]

plt.figure(figsize=(20, 8))
plt.title("Clients with the highest % Occupational Accidents Reported")
sns.barplot(data=resultplot21, x='CONTINGENCIA', y='%', hue='COD_CLIENTE')
plt.grid(axis='y', linestyle='--', linewidth=0.5)
plt.show()
Seven clients of EFICACIA SA have employees who reported more than 100 occupational accidents; they are identified by the codes 105, 169, 236, 482, 1330, 1423, and 1444. The percentage of occupational accidents attributable to these clients ranges between 3.68% (client 105) and 10.98% (client 236).
Disabilities due to work accidents were filtered and grouped by geographic region. Then, the number of accidents per client (called the rate) was computed for each region.
# Plot 2.2: work accidents per client ("accident rate") for each region.

# Number of DISTINCT clients per region. A plain .count() would count rows of
# `result` (one per worker/disability record), not clients, which would not
# match the "accidents per client" definition of the rate.
resultplot221 = (
    result.groupby('REGIONAL')['COD_CLIENTE']
    .nunique()
    .reset_index(name='TOTAL_CLIENTES')
)

# Number of work-accident records per region.
resultplot222 = (
    result[result['CONTINGENCIA'] == "Accidente de Trabajo"]
    .groupby(['REGIONAL', 'CONTINGENCIA'])['CONTINGENCIA']
    .count()
    .to_frame(name='TOTAL_ACCIDENTES')
    .reset_index()
)

# Inner join: only regions with at least one work accident are kept.
resultplot22 = resultplot221.join(
    resultplot222.set_index('REGIONAL'), on='REGIONAL', how="inner"
)

# Accident rate = accidents in the region / distinct clients in the region.
x_p = list(
    np.round(
        resultplot22["TOTAL_ACCIDENTES"].values
        / resultplot22["TOTAL_CLIENTES"].values,
        4,
    )
)
resultplot22["INDICE_ACCIDENTABILIDAD"] = x_p

plt.figure(figsize=(20, 8))
plt.title("Colombian Region VS Occupational Accidents Rate")
sns.barplot(data=resultplot22, x='CONTINGENCIA', y='INDICE_ACCIDENTABILIDAD', hue='REGIONAL')
plt.grid(axis='y', linestyle='--', linewidth=0.5)
plt.show()
The accident rate (number of accidents per client in a given region) is obtained by dividing the total number of accidents reported in each region by the number of clients assigned to that region. The region with the highest occupational accident rate is the NORTH region, while the one with the lowest accident rate is the CENTER region; the other regions have similar accident rates.
Once the reported disabilities have been identified, they are grouped by the month in which they occurred and by the generational group to which the employee belongs.
# Plot 3: percentage of disability records per start month and generation.

# Work on an explicit copy: FECHA_INICIO is overwritten below, and assigning
# onto a filtered slice of `result` is ambiguous (SettingWithCopyWarning,
# which the global warnings filter would otherwise hide).
resultplot3 = result[result['INC_TRUE'] == 1].copy()

# Reduce the start date to its two-digit month ('01'..'12'); the year is
# deliberately dropped so all years are pooled per calendar month.
date1 = resultplot3['FECHA_INICIO'].dt.to_period(freq='M')
resultplot3['FECHA_INICIO'] = date1.dt.to_timestamp().dt.strftime('%m')

resultplot3 = (
    resultplot3
    .groupby(['FECHA_INICIO', 'GENERACION'])['GENERACION']
    .count()
    .to_frame(name='TOTAL_INCAPACIDADES')
    .reset_index()
)

# Share of each (month, generation) group over all counted records, in percent.
monthly_counts = resultplot3["TOTAL_INCAPACIDADES"].values
x_p = list(np.round(monthly_counts / monthly_counts.sum(), 4) * 100)
resultplot3["%"] = x_p

plt.figure(figsize=(20, 8))
plt.title("% Disabilities vs Month")
sns.barplot(data=resultplot3, x='FECHA_INICIO', y='%', hue='GENERACION')
plt.show()
More disabilities are reported in the first months of the year than in the last months. Since most EFICACIA SA employees are Millennials, this is the group that reports the most disabilities, followed by Generation Z, Generation X, and Baby Boomers, respectively.
Employees who have been disabled were filtered and grouped by business area, age, and gender.
# Plot 5: age distribution (EDAD) of workers with a disability record, per
# business area (NEGOCIO) and gender (GENERO). Rows with EDAD above 60 are
# excluded.
disabled_up_to_60 = (result['INC_TRUE'] == 1) & (result['EDAD'] <= 60)
resultplot5 = result[disabled_up_to_60]

plt.figure(figsize=(20, 8))
plt.title("Bussines Sector vs Age Workers")
# Vertical tick labels for the business-area axis.
plt.xticks(rotation=90)
sns.boxplot(x='NEGOCIO', y='EDAD', hue='GENERO', data=resultplot5)
plt.grid(linestyle='--', linewidth=0.5)
plt.show()
In general, the average age of employees who become disabled is 31 years; however, SENA and RETAIL are the business areas where the youngest personnel become disabled. The oldest employees who become disabled belong to the administrative area, and the youngest belong to the SENA area.
Disabilities caused by occupational accidents were plotted along the timeline in which they occurred. In addition, they were broken down by the business area to which the corresponding employees belong.
# Plot 4: monthly number of reported work accidents per business area.

# Work on an explicit copy: FEC_REP_AT is overwritten below, and assigning
# onto a filtered slice of `result` is ambiguous (SettingWithCopyWarning,
# which the global warnings filter would otherwise hide).
resultplot4 = result[result['CONTINGENCIA'] == "Accidente de Trabajo"].copy()

# Bucket the report date by calendar month, formatted as 'YYYY-MM' so the
# labels sort chronologically.
date2 = resultplot4['FEC_REP_AT'].dt.to_period(freq='M')
resultplot4['FEC_REP_AT'] = date2.dt.to_timestamp().dt.strftime('%Y-%m')

resultplot4 = (
    resultplot4
    .groupby(['FEC_REP_AT', 'NEGOCIO'])['NEGOCIO']
    .count()
    .to_frame(name='TOTAL_WORK_ACCIDENTS')
    .reset_index()
)

plt.figure(figsize=(20, 8))
plt.title("Bussines Sector vs Work Accidents Reported")
# Vertical tick labels for the month axis.
plt.xticks(rotation=90)
sns.lineplot(data=resultplot4, x='FEC_REP_AT', y='TOTAL_WORK_ACCIDENTS', hue='NEGOCIO')
plt.grid(linestyle='--', linewidth=0.5)
plt.show()
Historically, Go to Market and Talent & Solutions have been the business areas with the highest number of workplace accidents. These business areas are responsible for advertising products, marketing, and distribution in stores. Additionally, it is evident that since the beginning of 2019 the trend in the number of occupational accidents has increased considerably.
In this space, the correlations, interactions, heat maps, and other visualizations generated by the pandas profiling report are shown.
# Metadata passed to the profiling report through its `dataset` argument.
dataset_info = {
    "description": "occupational accidents and disabilities in EFICACIA",
    "copyright_holder": "DS4A",
    "copyright_year": "2022",
}

# Only the Pearson and Kendall correlations are computed; Spearman and phi_k
# are switched off.
correlation_settings = {
    method: {"calculate": enabled}
    for method, enabled in (
        ("pearson", True),
        ("spearman", False),
        ("kendall", True),
        ("phi_k", False),
    )
}

profile = ProfileReport(
    result,
    title="Risk of occupational accidents- Visualizations",
    dataset=dataset_info,
    correlations=correlation_settings,
    interactions=None,
)
# Bare expression: renders the report when this is the last line of a
# notebook cell.
profile